Import Packages
library(ggplot2)
library(plotly)
library(data.table)
Import Data and look at first couple records
setwd("F:/kaggle/Mercedes-Benz")
sample_submission <- fread("data/sample_submission.csv")
train <- fread("data/train.csv")
test <- fread("data/test.csv")
train[1:2,]
# 1 - ID
# 2 - Response Variable (Time in Seconds on Test Stand)
# 3-10 String Option Codes
# 11-385 0/1 values based on option codes
# X7 and X9 are missing for some reason.
Calculate Mean and SD for Response Variable (Y). This is the time (sec) for a vehicle on the MB test station.
mean(train[,y])
[1] 100.6693
sd(train[,y])
[1] 12.67938
Create plots for the Response Variable. Sorted by ID.
plot_ly(y=train$y, type="scatter")
Create plots for the Response Variable. Sorted by the Response Variable.
plot_ly(y=train[order(y),]$y, type="scatter")
Create histogram for the Response Variable.
plot_ly(x=train$y, type="histogram")
Create histogram for the LOG(Response Variable).
plot_ly(x=log(train$y), type="histogram")
Frequency tables for the first 10 variables (All with String Codes)
cbind(train[,.N,by=X0],Mean_Y=round(train[,mean(y),by=X0]$V1,1))[order(-N)]
cbind(train[,.N,by=X1],Mean_Y=round(train[,mean(y),by=X1]$V1,1))[order(-N)]
cbind(train[,.N,by=X2],Mean_Y=round(train[,mean(y),by=X2]$V1,1))[order(-N)]
cbind(train[,.N,by=X3],Mean_Y=round(train[,mean(y),by=X3]$V1,1))[order(-N)]
cbind(train[,.N,by=X4],Mean_Y=round(train[,mean(y),by=X4]$V1,1))[order(-N)]
cbind(train[,.N,by=X5],Mean_Y=round(train[,mean(y),by=X5]$V1,1))[order(-N)]
cbind(train[,.N,by=X6],Mean_Y=round(train[,mean(y),by=X6]$V1,1))[order(-N)]
cbind(train[,.N,by=X8],Mean_Y=round(train[,mean(y),by=X8]$V1,1))[order(-N)]#where is 7
#where is 9
Frequency plots for the first 10 variables (All with String Codes)
# column_names_for_option_plots_string_codes <- colnames(train)[3:5] #3:12
# for(i in column_names_for_option_plots_string_codes){
# x_values <- as.data.frame(train[,.N,by=i][order(-N)])[,i]
# y_values <- as.data.frame(train[,.N,by=i][order(-N)])[,'N']
# plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar")
##Couldn't get loop plotting to work => https://github.com/ropensci/plotly/issues/273
#Plotting manually
column_names_for_option_plots_string_codes <- colnames(train)[3:10] #3:10
i<-1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
i<-i+1
current_colname <- column_names_for_option_plots_string_codes[i]
x_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,current_colname]
y_values <- as.data.frame(train[,.N,by=current_colname][order(-N)])[,'N']
plot_ly(x=as.list(x_values),y=as.list(y_values), type="bar") %>% layout(title=current_colname)
Frequency table for the remaining variables (All with 0/1 Coding)
column_names_for_summary <- colnames(train)[11:length(colnames(train))]
Variable <- c()
N_0 <- c()
N_1 <- c()
Mean_0 <- c()
Mean_1 <- c()
for(i in column_names_for_summary){
j<-1
Variable <- c(Variable,i)
N_0 <- c(N_0,train[,.N,by=i][1]$N)
N_1 <- c(N_1,train[,.N,by=i][2]$N)
Mean_0 <- c(Mean_0,round(train[,mean(y),by=i][1]$V1,1))
Mean_1 <- c(Mean_1,round(train[,mean(y),by=i][2]$V1,1))
j <- j+1
}
# N_0[is.na(N_0)==TRUE] <- 0
# N_1[is.na(N_1)==TRUE] <- 0
# Mean_0[is.na(Mean_0)==TRUE] <- 0
# Mean_1[is.na(Mean_1)==TRUE] <- 0
summary_results <- as.data.frame(cbind(Variable,N_0=N_0,N_1,Mean_0,Mean_1), stringsAsFactors = FALSE)
summary_results$N_0 <- as.integer(summary_results$N_0)
summary_results$N_1 <- as.integer(summary_results$N_1)
summary_results$Mean_0 <- as.numeric(summary_results$Mean_0)
summary_results$Mean_1 <- as.numeric(summary_results$Mean_1)
summary_results$Delta_Mean <- summary_results$Mean_1 - summary_results$Mean_0
summary_results
After looking at Kaggle, checked for duplicate fields - added the duplicate flag to the frequency table for the remaining variables (all with 0/1 coding).
train_2 <- train[, !duplicated(t(train))] #remove duplicated fields ... from raddar@Kaggle => https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34006
duplicate_column <- c()
for(i in duplicated(t(train))){
duplicate_column <- c(duplicate_column,i)
}
duplicate_column <- duplicate_column[11:length(duplicate_column)]
summary_results$duplicate_column <- duplicate_column
summary_results_decreasing <- summary_results[order(-N_1),]
summary_results
summary_results_decreasing
List of Duplicated Columns
summary_results_decreasing[(summary_results_decreasing$duplicate_column==TRUE),]
Create Response Plots for all Binary Options
train_df <- as.data.frame(train)
train_df_dedupe <- train_df[, !duplicated(t(train_df))] #remove duplicated fields ... from raddar@Kaggle => https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34006
column_names_for_binary_plots <- colnames(train_df_dedupe)[11:length(colnames(train_df_dedupe))]
l <- htmltools::tagList()
for (i in 1:length(column_names_for_binary_plots)) { #
plot_variable <- column_names_for_binary_plots[i]
l[[i]] <- plot_ly(x=train_df_dedupe[,plot_variable],y=train_df_dedupe$y, type="box", height=300) %>% layout(title=plot_variable)
}
l
---
title: "Mercedes-Benz EDA Notebook"
output: html_notebook
author: Jeff Hedberg
date: 5-June-2017    
---
<br>

#### Import Packages
```{r,message=FALSE,warning=FALSE}
library(ggplot2)
library(plotly)
library(data.table)
```

<br>

####  Import Data and look at first couple records
```{r}
# Root folder of the Kaggle Mercedes-Benz project. File paths are built from
# it explicitly instead of calling setwd(): knitr restores the working
# directory after a chunk has run, so setwd() would not carry over to later
# chunks and would make the notebook depend on hidden state.
project_dir <- "F:/kaggle/Mercedes-Benz"
data_dir <- file.path(project_dir, "data")

sample_submission <- fread(file.path(data_dir, "sample_submission.csv"))
train <- fread(file.path(data_dir, "train.csv"))
test <- fread(file.path(data_dir, "test.csv"))

# Peek at the first two training records.
head(train, 2)

# Column layout of train:
#   1       ID
#   2       Response variable y (time in seconds on the test stand)
#   3-10    String option codes
#   11-385  0/1 values based on option codes
#
# X7 and X9 are missing for some reason.
```
<br>

####  Calculate Mean and SD for Response Variable (Y).  This is the time (sec) for a vehicle on the MB test station.
```{r}
# Mean and standard deviation of the response column y.
mean(train$y)
sd(train$y)
```
<br>

####  Create plots for the Response Variable.  Sorted by ID.
```{r, message=FALSE}
# Scatter of the response y in the row order of train.
plot_ly(y = train[["y"]], type = "scatter")
```
<br>

####  Create plots for the Response Variable.  Sorted by the Response Variable.
```{r, message=FALSE}
# Scatter of the response y after sorting the rows by y (ascending).
plot_ly(y = train[order(y), y], type = "scatter")
```
<br>

####  Create histogram for the Response Variable.
```{r}
 # Distribution of the raw response y.
 plot_ly(x = train[["y"]], type = "histogram")
```
<br>

####  Create histogram for the LOG(Response Variable).
```{r}
 # Distribution of the natural log of the response y.
 plot_ly(x = log(train[["y"]]), type = "histogram")
```
<br>

####  Frequency tables for the first 10 variables (All with String Codes)
```{r}
# Row count (N) and mean response (Mean_Y, rounded to 1 decimal) for each
# level of one string-coded column, most frequent level first.
# Both statistics come from a single grouped pass, so they cannot get out of
# step with each other (previously two separate group-bys were cbind()-ed).
#   column: name of a column of train, given as a string.
level_summary <- function(column) {
  train[, .(N = .N, Mean_Y = round(mean(y), 1)), by = column][order(-N)]
}

# Each call stays a separate top-level expression so every table is printed.
level_summary("X0")
level_summary("X1")
level_summary("X2")
level_summary("X3")
level_summary("X4")
level_summary("X5")
level_summary("X6")
level_summary("X8")
# No tables for X7 and X9: those columns are missing from the data.

```
<br>

####  Frequency plots for the first 10 variables (All with String Codes)
```{r}
# Bar chart of level frequencies for each string-coded column (columns 3-10),
# most frequent level first, titled with the column name.
#
# plotly widgets created inside a loop are not printed automatically
# (https://github.com/ropensci/plotly/issues/273), so the charts are collected
# in a list and printed once through htmltools::tagList() -- the same approach
# the box-plot chunk at the end of this notebook uses.
column_names_for_option_plots_string_codes <- colnames(train)[3:10]

option_plots <- lapply(
  column_names_for_option_plots_string_codes,
  function(current_colname) {
    # One grouped pass gives both the levels (x) and their counts (y).
    level_counts <- train[, .N, by = current_colname][order(-N)]
    plot_ly(
      x = as.list(level_counts[[current_colname]]),
      y = as.list(level_counts[["N"]]),
      type = "bar"
    ) %>%
      layout(title = current_colname)
  }
)
htmltools::tagList(option_plots)

```
<br>

####  Frequency table for the remaining variables (All with 0/1 Coding)
```{r}
# Per-column summary of every 0/1 option column (column 11 onwards):
#   N_0 / N_1        number of rows where the column equals 0 / 1
#   Mean_0 / Mean_1  mean response y over those rows, rounded to 1 decimal
#   Delta_Mean       Mean_1 - Mean_0
#
# Rows are selected by the column's actual value. The previous version took
# the first and second group of `train[, .N, by = col]`, but data.table
# returns groups in order of first appearance, so for any column whose first
# row is 1 the "0" and "1" statistics were swapped.
column_names_for_summary <- tail(colnames(train), -10L)
Variable <- column_names_for_summary
response <- train[["y"]]

# Number of rows, per summarised column, where the column equals `value`.
count_where <- function(value) {
  vapply(
    Variable,
    function(column) sum(train[[column]] %in% value),
    integer(1),
    USE.NAMES = FALSE
  )
}

# Mean of y, per summarised column, over rows where the column equals
# `value`; NA when no row has that value.
mean_y_where <- function(value) {
  vapply(
    Variable,
    function(column) {
      rows <- train[[column]] %in% value
      if (any(rows)) round(mean(response[rows]), 1) else NA_real_
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

# The free-standing vectors are kept because later chunks may refer to them.
# A value that never occurs now yields a count of 0 (previously NA); its mean
# stays NA.
N_0 <- count_where(0)
N_1 <- count_where(1)
Mean_0 <- mean_y_where(0)
Mean_1 <- mean_y_where(1)

# Build the data frame directly so counts stay integer and means stay numeric
# (no round trip through a character matrix).
summary_results <- data.frame(
  Variable, N_0, N_1, Mean_0, Mean_1,
  stringsAsFactors = FALSE
)
summary_results$Delta_Mean <- summary_results$Mean_1 - summary_results$Mean_0
summary_results
```
<br>

####  After looking at Kaggle, checked for duplicate fields - added the duplicate flag to the frequency table for the remaining variables (all with 0/1 coding).
```{r}
# Flag columns whose values exactly repeat an earlier column ... from
# raddar@Kaggle => https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34006
# The transpose is expensive, so the flags are computed once and reused.
is_duplicate_column <- duplicated(t(train))

# De-duplicated copy of train. `with = FALSE` makes data.table treat j as a
# column selector; without it the logical expression is simply evaluated and
# returned as a plain vector instead of a table.
train_2 <- train[, which(!is_duplicate_column), with = FALSE]

# Keep only the flags for the 0/1 columns (column 11 onwards) so they line up
# with the rows of summary_results.
duplicate_column <- tail(is_duplicate_column, -10L)

summary_results$duplicate_column <- duplicate_column

# Sort by the data frame's own N_1 column rather than by a free-standing N_1
# vector that happens to be left in the global environment.
summary_results_decreasing <- summary_results[order(-summary_results$N_1), ]

summary_results
summary_results_decreasing
```
<br>

####  List of Duplicated Columns
```{r}
# Rows of the sorted summary whose column is flagged as a duplicate.
summary_results_decreasing[summary_results_decreasing$duplicate_column, ]
```
<br>

####  Create Response Plots for all Binary Options
```{r}
# One box plot of the response y per de-duplicated 0/1 option column.
#
# A plain data.frame is used here because `df[, logical]` selects columns,
# which the de-duplication idiom below relies on.
train_df <- as.data.frame(train)
# Remove duplicated fields ... from raddar@Kaggle =>
# https://www.kaggle.com/c/mercedes-benz-greener-manufacturing/discussion/34006
train_df_dedupe <- train_df[, !duplicated(t(train_df))]

# Everything after the first 10 columns is plotted.
# NOTE(review): assumes none of the first 10 columns (ID, y, string codes)
# was dropped as a duplicate -- confirm.
column_names_for_binary_plots <- tail(colnames(train_df_dedupe), -10L)

# lapply() copes with an empty column list, unlike `1:length(x)`, which would
# iterate over c(1, 0). The widgets are wrapped in a tagList so they are all
# rendered when the list is printed.
binary_plots <- lapply(
  column_names_for_binary_plots,
  function(plot_variable) {
    plot_ly(
      x = train_df_dedupe[, plot_variable],
      y = train_df_dedupe$y,
      type = "box",
      height = 300
    ) %>%
      layout(title = plot_variable)
  }
)
l <- htmltools::tagList(binary_plots)
l

```
<br>

